
###############################
## 1. CLUSTER GENERATION
###############################


library(fastLink)
library(data.table)
library(here)

i_am("1_cluster_generation.R")
# NOTE(review): this wipes every object in the current environment, so any
# input data (e.g. the MTO dataset) has to be loaded *after* this line.
rm(list=ls())

# set primary directory and folders
destination <- here()
# here() returns the project root WITHOUT a trailing separator, so the folder
# name must be joined with file.path(); plain paste0() would produce
# "<root>temporary_files/". The trailing '/' is kept on purpose because the
# rest of the script appends to temp.folder with paste0().
temp.folder <- paste0(file.path(destination, 'temporary_files'), '/')

# set working directory (the relative load()/save() paths below depend on it)
setwd(destination) 

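#### load MTO dataset as data.frame
#### NOTE: the code below expects an object named `mto.dat` with (at least) the
#### columns `gender` (coded 'm'/'f') and `first` (first name).
#mto.dat <- #<READ MTO DATASET HERE> 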


## create new folder for small clusters
# recursive = TRUE: also create temp.folder itself if it does not exist yet.
# showWarnings = FALSE: do not warn when the folder is already there (re-runs).
dir.create(paste0(temp.folder,'small_clusters/'), recursive = TRUE, showWarnings = FALSE)

gender.cat <- c("male","female")
size <- 2500000 # this determines the maximum cluster size. Can be adjusted according to match time/success
mto.clusters <- NULL
# holder for cluster summaries: one row per gender category, filled in the loop
cluster.stats.small <- data.frame(cat=gender.cat, n.clusters=rep(NA,length(gender.cat)))
# holder for per-cluster details (one row per gender x cluster), grown in the loop
cluster.details.small <- NULL

# For each gender category:
#   1. cluster the MTO and L2 first names jointly with clusterMatch(),
#   2. save the MTO subset with its cluster assignment,
#   3. split the L2 matching data into one .RData file per cluster,
#   4. record how many L2 / MTO records fall into each cluster.
# Expects `mto.dat` (columns `gender`, `first`) to exist; the two .RData files
# loaded per gender are expected to provide `voter.first` and `voter.full`.
for(gender in gender.cat){
  # start by reading in first names for gender
  load(paste0('./l2_first_names_', gender,'.RData'))
  
  # subset MTO dat to gender category to determine clusters
  # (gender is coded by its first letter in mto.dat: 'm' / 'f')
  mto.sub <- mto.dat[which(mto.dat$gender==substring(gender,1,1)),]
  
  # now determine clusters based on first names of MTO and L2 subsets
  cluster.out <- clusterMatch(vecA = mto.sub$first, vecB = voter.first, max.n=size)
  voter.clusters <- cluster.out$clusterB
  mto.sub$cluster <- cluster.out$clusterA
  save(mto.sub,file=paste0(temp.folder,"small_clusters/small_",'mto_clusters_',gender,'.RData'))
  cluster.stats.small$n.clusters[cluster.stats.small$cat==gender] <- cluster.out$n.clusters
  
  rm(list=c('voter.first','cluster.out')) # clean memory
  
  # load rest of L2 data for matching and save subsets by cluster to reduce pairwise comparisons
  load(paste0('l2_match_vars_',gender,'.RData'))
  
  # save each L2 cluster subset and temporary MTO cluster subset
  # NOTE(review): assumes cluster ids are the consecutive integers 1..K -- confirm
  n.voter.clusters <- length(unique(voter.clusters))
  # collect per-cluster rows in a list and bind once (avoids rbind-in-loop copies)
  details.list <- vector('list', n.voter.clusters)
  for(cluster.id in seq_len(n.voter.clusters)){
    cat(cluster.id)
    voter.sub <- voter.full[voter.clusters==cluster.id,]
    save(voter.sub, file=paste0(temp.folder,"small_clusters/small_",paste('l2',gender,cluster.id,sep='_'),'.RData'))
    
    # cluster details
    # Count MTO records in this cluster. The loop variable must NOT be called
    # `cluster`: inside subset(mto.sub, cluster == cluster) both sides resolve
    # to the column, so the filter was always TRUE and every cluster reported
    # the full MTO subset size.
    mto.n <- sum(mto.sub$cluster == cluster.id, na.rm = TRUE)
    details.list[[cluster.id]] <- data.frame(gender=gender,cluster=cluster.id,l2=nrow(voter.sub),mto=mto.n,
                                             letters=paste(unique(substring(voter.sub$first,1,1)), 
                                                           collapse="|"))
    cat('done\n')
    rm('voter.sub')
  }
  cluster.details.small <- rbind(cluster.details.small, do.call(rbind, details.list))
  cat(gender,'done\n')
}

# save cluster stats
print(cluster.stats.small)
print(cluster.details.small)
# make sure the output folder exists so the results of the (long) run above
# are not lost to a "cannot open file" error from save()
dir.create('./output', recursive = TRUE, showWarnings = FALSE)
save(cluster.stats.small, file='./output/small_cluster_stats.RData')
save(cluster.details.small, file="./output/small_cluster_details.RData")

